## "replication_preparation.R" from replication package for "Registering Returning Citizens to Vote"
## This script shows the creation of the main deidentified analysis dataset as well as several additional datasets
## It shows the merge process used to combine various identifiable datasets. We do not provide those underlying datasets
## because of the sensitivity of criminal records, but we provide this code for transparency about the record-linkage process

# > sessionInfo()
# R version 4.1.3 (2022-03-10)
# Platform: x86_64-apple-darwin17.0 (64-bit)
# Running under: macOS Catalina 10.15.7


rm(list=ls())
library(data.table)
library(stringdist)
library(tidyverse)
library(readxl)
library(wru)
library(dplyr)

############################################################################################
## load voter file
############################################################################################
# load in voter file & trim down to new registrants
voters <- fread("../Data/mainstudyfall2020/NCvoterdata/ncvoter_Statewide.txt",  
                header=TRUE, colClasses=c(rep("character", 71)), 
                select=c("county_desc", 	"voter_reg_num", "name_prefx_cd", "first_name", "middle_name", "last_name", "name_suffix_lbl", "birth_year", "birth_age", "voter_status_desc", "voter_status_reason_desc", "res_street_address", "res_city_desc", "state_cd", "zip_code","mail_addr1", "mail_addr2", "mail_addr3", "mail_addr4", "mail_city", "mail_state", 		"mail_zipcode","registr_dt", "race_code","ethnic_code","party_cd","gender_code","birth_state")) 

# clean data format and choose relevant registration date
voters[, registrationdate := as.Date(registr_dt, "%m/%d/%Y")]
newvoters <- voters[registrationdate >= "2020-08-01" & registrationdate <= "2020-11-03",] 

# clean some columns
newvoters[, voterID := paste(county_desc, voter_reg_num, sep="")] #stick county name and voter reg 
newvoters[, firstinitial:= substr(first_name, 1, 1)]
newvoters[, birth_year := as.numeric(birth_year)]

# rm(voters) #free up some memory

############################################################################################
## match the voter file with voting history 
############################################################################################
# also pull in 2020 vote hist and add that in before merge. 
votehist <- fread("../Data/mainstudyfall2020/NCvoterdata/ncvhis_Statewide.txt", 
                  header=TRUE, colClasses=c(rep("character", 15)),
                  select=c("county_desc", "voter_reg_num", "election_lbl", "election_desc", "voted_party_cd"))

# stick county name and voter reg 
votehist[, voterID := paste(county_desc, voter_reg_num, sep="")]

# get voting information on 2020 election 
votes2020gen <- votehist[votehist$election_lbl=="11/03/2020",]
dim(votes2020gen) #should be 5.54mil votes, according to NCSBE

# should be all 100 counties reporting
counties <- table(votes2020gen$county_desc); dim(counties)

# assign voting record information 
newvoters[, voted2020gen:=0]
newvoters[voterID %in% votes2020gen$voterID, voted2020gen:=1] 
summary(newvoters$voted2020gen)

newvoters[, onlinefieldsmiss:= 0]
newvoters[gender_code=="U" & race_code=="U" & birth_state=="", onlinefieldsmiss:= 1]

# clean names before merging
# trim whitespace out of names
newvoters[, first_namet:= trimws(first_name)] 
newvoters[, last_namet:= trimws(last_name)]
newvoters[, middle_namet:= trimws(middle_name)]
newvoters[, fullname:= paste(first_name, middle_name, last_name, sep=" ")]


# saveRDS("tmp/NC_newvoters_merged_vh.rds", object = newvoters)
# newvoters <- readRDS("tmp/NC_newvoters_merged_vh.rds")


############################################################################################
## experimental data setup
############################################################################################
mainasst <- read.csv("../Data/mainstudyfall2020/mainstudyfall2020_fulltreatmentasstdataset.csv")

# get NC data
NC_original <- mainasst[mainasst$list == "NCmainlist",]

# start to fill in some simple covars. 
# was this person incarcerated, on supervision, or both?
# if you have no last name from the inmate file, you don't have an inmate record
NC_original$pastincarc <- ifelse(is.na(NC_original$CICLSTNM), 0, 1) 
NC_original$lastincarcend <- as.Date(NC_original$CILAMVDT, "%m/%d/%Y")

# create a data table for merging
NCsamp <- data.table(NC_original)

# line up some colnames:
NCsamp[, mail_addr1 := TP.Address.1]
NCsamp[, mail_city := TP.City.1]
NCsamp[, mail_zipcode := as.character(TP.Zip.Code.1)]
NCsamp[, county_desc := TP.County.1]
NCsamp[, dpsname:= paste(first_name, last_name, sep=" ")]
NCsamp[, fullname:= paste(first_name, middle_name, last_name, sep=" ")]


############################################################################################
## merge the main data with the voter file 
############################################################################################

####################################
# 1. exact merging name/yob/address matches
####################################
variablenames <- c("last_name","first_name", "middle_name", "birth_year", "mail_addr1", "mail_city", "mail_zipcode", "county_desc")
exactmatches <- merge(newvoters, NCsamp, by = variablenames)
dim(exactmatches)

####################################
# 2. exact merging name/yob matches including middle name
####################################
exactnamematches1 <- merge(newvoters, NCsamp, by=c("last_name","first_name", "middle_name", "birth_year"))
dim(exactnamematches1)

#drop the matches we already found
exactnamematches1 <- exactnamematches1[!(CMDORNUM %in% exactmatches$CMDORNUM),]
dim(exactnamematches1); 
# head(subset(exactnamematches1, select=c("last_name","first_name", "middle_name", "birth_year", "mail_addr1.x", "mail_city.x", "mail_zipcode.x", "county_desc.x", "mail_addr1.y", "mail_city.y", "mail_zipcode.y", "county_desc.y")))

exactnamematches1[county_desc.x==county_desc.y, countymatch:=1]; table(exactnamematches1$countymatch) # most are in same county
exactnamematches1[mail_city.x==mail_city.y, citymatch:=1]; table(exactnamematches1$citymatch) #some in same city
exactnamematches1[mail_zipcode.x==mail_zipcode.y, zipmatch:=1]; table(exactnamematches1$zipmatch) #fewer in same zip
# length(unique(exactnamematches1$CMDORNUM)) #couple of duplicates
exactnamematches1 <- exactnamematches1[countymatch==1,] #add in a geographic limitation here (comment out if not using)


####################################
# 3. exact merging name/yob matches only with first and last name
####################################
firstlastnames <- merge(newvoters, NCsamp, by=c("last_name","first_name", "birth_year")) #drop middle name criterion
dim(firstlastnames)
firstlastnames1 <- firstlastnames[!(CMDORNUM %in% exactmatches$CMDORNUM) & !(CMDORNUM %in% exactnamematches1$CMDORNUM),] #drop the matches we already found
dim(firstlastnames1)

# the number of characters of middlen name
firstlastnames1[,mnamelengthx:= nchar(middle_name.x)] #how many characters in this field?
firstlastnames1[,mnamelengthy:= nchar(middle_name.y)] 
firstlastnames1[mnamelengthx==1 & middle_name.x == substr(middle_name.y,0,1), keep:=1] #initial-only matches initialized version: X/Y
firstlastnames1[mnamelengthy==1 & middle_name.y == substr(middle_name.x,0,1), keep:=1] # same but for Y/X

# keep the observations where an initial matches
firstlastnames2 <- firstlastnames1[keep==1,] #keep just the ones we flagged 
firstlastnames2[county_desc.x==county_desc.y, countymatch:=1]; table(firstlastnames2$countymatch) #most are in same county
firstlastnames2[mail_city.x==mail_city.y, citymatch:=1]; table(firstlastnames2$citymatch) #many in same city
firstlastnames2[mail_zipcode.x==mail_zipcode.y, zipmatch:=1]; table(firstlastnames2$zipmatch) #some in same zip
length(unique(firstlastnames2$CMDORNUM)) #again, just a handful of duplicates

# choose observations where county matches
firstlastnames2 <- firstlastnames2[countymatch==1,] #add in a geographic limitation here (comment out if not using)


####################################
# 4. merge based on yob/mail address/name similarity
####################################
# what about people who register at the same address we have, with same YOB and similar name?
exactaddressYOBmatches1 <- merge(newvoters, NCsamp, by=c("birth_year", "mail_addr1", "mail_city", "mail_zipcode"))
dim(exactaddressYOBmatches1) #lots, but probably lots of completely different names. 
exactaddressYOBmatches1[, fnamedist := stringdist(first_name.x, first_name.y, method="jw")] #calculate string distances between these names
exactaddressYOBmatches1[, mnamedist := stringdist(middle_name.x, middle_name.y, method="jw")] 
exactaddressYOBmatches1[, lnamedist := stringdist(last_name.x, last_name.y, method="jw")] 
summary(exactaddressYOBmatches1$lnamedist)
exactaddressYOBmatches1[, fullnamedist := stringdist(fullname.x, fullname.y, method="jw")] #in case people use maiden as middle name? we'll see

# only keep more similar names (threshold .25)
exactaddressYOBmatches2 <- exactaddressYOBmatches1[(fnamedist <.25 & lnamedist <.25) | fullnamedist <.25,]; dim(exactaddressYOBmatches2) #trim down to closer name matches
summary(exactaddressYOBmatches2$lnamedist) #note these are mostly exact matches, actually
#middle names trickier, since some are missing and some are only initials
exactaddressYOBmatches2[,mnamelengthx:= nchar(middle_name.x)] #how many characters in this field?
exactaddressYOBmatches2[,mnamelengthy:= nchar(middle_name.y)]
summary(exactaddressYOBmatches2$mnamelengthx)
exactaddressYOBmatches2[ (mnamelengthx >1 & mnamelengthy >1) & mnamedist >.1, mnamemismatch:=1]; table(exactaddressYOBmatches2$mnamemismatch)
#not actually many people at same address with same YOB and similar first/last but different middle; maybe this isn't surprising!
exactaddressYOBmatches3 <- exactaddressYOBmatches2[is.na(mnamemismatch),]; dim(exactaddressYOBmatches3)
# drop out the perfect matches we already caught. 
addressYOBmatches <- exactaddressYOBmatches3[!(CMDORNUM %in% exactmatches$CMDORNUM) & !(CMDORNUM %in% firstlastnames2$CMDORNUM) & !(CMDORNUM %in% exactnamematches1$CMDORNUM),]; dim(addressYOBmatches)
length(unique(addressYOBmatches$CMDORNUM)) # one duplicate.


####################################
# merge everything together
####################################
allmatches <- rbind(exactmatches, addressYOBmatches, exactnamematches1, firstlastnames2, fill=TRUE) #stick together all these match approaches
dim(allmatches); length(unique(allmatches$CMDORNUM)); dim(unique(allmatches))

# keep observations with active voter status
allmatches <- allmatches[voter_status_desc =="ACTIVE",]; dim(allmatches) #can comment out if we want to look at efforts to be reg.

####################################
# create some variables
####################################
NC <- NC_original
# registration status
NC$reg2020 <- ifelse(NC$CMDORNUM %in% allmatches$CMDORNUM, 1, 0) #then use it to fill in the registration variable

table(NC$reg2020) 

# registration date
NC$registrationdate <-allmatches$registrationdate[match(NC$CMDORNUM,allmatches$CMDORNUM)]  #pull registration date over as well (for the registered)
summary(NC$registrationdate) 
#can we pull in party as well for those who registered?
table(NC$party_cd)

# party ID
NC$party <- allmatches$party_cd[match(NC$CMDORNUM,allmatches$CMDORNUM)]

# turnout
NC$voted2020gen <- 0 #also, turnout
NC[NC$CMDORNUM %in% allmatches[allmatches$voted2020gen==1,]$CMDORNUM, "voted2020gen"] <- 1
table(NC$voted2020gen)

onlinereg <- allmatches[allmatches$onlinefieldsmiss==1,]; dim(onlinereg)
NC$registeredonline <- ifelse(NC$CMDORNUM %in% onlinereg$CMDORNUM, 1, 0)

### also, merging in RUCA data (urbanicity of zipcodes) in response to discussant Qs
ruca <- read.csv("../Data/RUCA2010zipcode.csv") #pull in these classifications and clean up/limit to NC
ruca$zipcode <- gsub("''", "", ruca$X..ZIP_CODE..)
### downloaded this list from the USDA here, 2/23/2022: https://www.ers.usda.gov/data-products/rural-urban-commuting-area-codes.aspx
### see documentation here https://www.ers.usda.gov/data-products/rural-urban-commuting-area-codes/documentation/
NCr <- merge(NC, ruca[, c("zipcode", "RUCA1")], by.x="mailingzip", by.y="zipcode"); dim(NCr); dim(NC) #merge together
#looks good, let's set this as main NC file
NC <- NCr

NC$pastincarc <- ifelse(is.na(NC$CICLSTNM), 0, 1) 
NC$lastincarcend <- as.Date(NC$CILAMVDT, "%m/%d/%Y")

NC$pastsupervision <- ifelse(is.na(NC$CDSLSTNM), 0, 1) #same logic for supervision

# saveRDS(object = NC, file = "tmp/NC_merged.rds")
# NC <- readRDS("tmp/NC_merged.rds")

############################################################################################
## TEXAS DATA SETUP
############################################################################################

# pull in the original main-study assignment dataset
# and subset to TX to produce the dataset for L2 to merge to TX voter file. 
TX <- mainasst[mainasst$list=="TXmainlist",] #TX only
# as annoying as it is for later merges, I think we should send them just a subset of columns--otherwise they'll get confused/annoyed
keepcols <- c("X.1", "X", "first_name", "middle_name", "last_name","TP.First.Name", "TP.Middle.Name", "TP.Last.Name", "dobdte","sexcod","raccod", "ethcod", "TP.Address.1", "TP.City.1", "TP.Zip.Code.1","TP.County.1")

TXforL2 <- TX[, keepcols]

### we sent this CSV to L2 for matching in late November. Next, we'll merge in what we got back from them. 
#L2TX <- read.csv("texas_L2match_november.csv") #load in L2 match. note that we'll want to update this once we get the later version with vote history in there. 
L2TX <- read.csv("../Data/mainstudyfall2020/texas_L2match_april.csv") #load in L2 match; now pulling in later version w/vote hist.

L2TX$officialregdate <- as.Date(L2TX$Voters_OfficialRegDate, format="%m/%d/%Y")
L2TX$calcregdate <- as.Date(L2TX$Voters_CalculatedRegDate, format="%m/%d/%Y")


TXdiag <- data.table(TXforL2)
TXdiag[, TPchangedlastname:=0];
TXdiag[last_name != TP.Last.Name, TPchangedlastname:=1]; summary(TXdiag$TPchangedlastname) #so a fair number. 
TXdiag[,TPchangedfirstname := stringi::stri_detect_fixed(TP.First.Name, first_name)] #need to see if 

TXdiag[, TPchangedanyname:=0]; TXdiag[TPchangedfirstname==T | TPchangedlastname==1, TPchangedanyname:=1]; 

L2TX_trim2 <- L2TX[is.na(L2TX$officialregdate)==F & L2TX$officialregdate < as.Date("2020-08-01"), ]; dim(L2TX_trim2) #early registrants
TX$prereg2020 <- ifelse(TX$Sequence %in% L2TX_trim2$X, 1, 0) #then use it to fill in the registration variable

#mark people from TX who were registered at the time we treated them due to L2 pre-assignment merge issue
L2TX_trim <- L2TX[is.na(L2TX$officialregdate)==F, ]; dim(L2TX_trim) 
L2TX_prereg <- L2TX[is.na(L2TX$officialregdate)==F & L2TX$officialregdate < as.Date("2020-08-01") , ]; dim(L2TX_prereg) 
TX$reg2020 <- ifelse(TX$Sequence %in% L2TX_trim$X, 1, 0) #then use it to fill in the registration variable
TX$voted2020gen <- ifelse(TX$Sequence %in% L2TX_trim[L2TX_trim$General_2020=="Y",]$X, 1, 0) #then use it to fill in the turnout variable

TX$registrationdate <-L2TX_trim$officialregdate[match(TX$Sequence,L2TX_trim$X)]  
#can we also pull in party reg where available?
table(L2TX_trim$Parties_Description) #need to verify with L2 that this is from the file and not modeled. 

TX$party <- L2TX_trim$Parties_Description[match(TX$Sequence, L2TX_trim$X)]
table(TX$party, exclude=NULL)

#also pull in case covars from state conviction database
TXcovars <- read.csv("../Data/TXconviction/texas_covars.csv")

TXcovars$lastincarcend <- as.Date(TXcovars$sentenceend_cust) #convert to date to be comparable to NC
TXcovarstrim <- subset(TXcovars, select=c("nameID", "still_in_custody", "no_end_date", "ever_in_custody", "ever_under_supervision", "any_felony", "lastincarcend")) #and can pull in other covars here as we want to incorporate into analysis
TXmerged <- merge(TX, TXcovarstrim, by="nameID"); dim(TXmerged); dim(TX) 

table(TXmerged$prereg2020)

TX <- TXmerged #just pull the whole thing through rather than subsetting it here. 


############################################################################################
## Bind NC data and TX data
############################################################################################
# clean NC data
NC$prereg2020 <- already_reg <- NC$no_end_date <- NC$still_in_custody <- NA #line up columns across states so they stack neatly
NC$any_felony <- 1 

# clean TX data
TX$registeredonline <- NA
TX$RUCA1 <- NA
colnames(TX)[which(names(TX) == "ever_under_supervision")] <- "pastsupervision" #line these names up with NC vars
colnames(TX)[which(names(TX) == "ever_in_custody")] <- "pastincarc" 

main <- rbind(TX, NC) 

############################################################################################
## Assign covariates etc
############################################################################################


main$urbanicity <- as.factor(main$RUCA1) #lots of ways to slice this but try just setting as factor?
main$ageyears <-  2020-main$birthyear #approximate age
main$NorthCarolina <- ifelse(main$list=="NCmainlist", 1, 0)
main$Black <- 0
main[(main$NorthCarolina==1 & trimws(main$CMCLRACE)=="BLACK"), "Black"] <- 1 #for NC
main[(main$NorthCarolina==0 & main$raccod=="B"), "Black"] <- 1 #for TX
summary(main$Black)
#then, let's also try to set up white/hispanic indicators, though I should note I'm uncertain about our ability to distinguish using their "ethnicity" data
main$white <- 0
main[(main$NorthCarolina==1 & trimws(main$CMCLRACE)=="WHITE"), "white"] <- 1 #for NC
main[(main$NorthCarolina==0 & main$raccod=="W"), "white"] <- 1 #for TX
summary(main$white)
table(main$raccod) #note that race/ethnicity are tracked in separate fields, so I'm guessing most people they consider hispanic are also getting marked white. think about whether to change that
main$hispanic <- 0
main[(main$NorthCarolina==1 & trimws(main$CMETHNIC)=="HISPANIC/LATINO"), "hispanic"] <- 1 #for NC
main[(main$NorthCarolina==0 & main$ethcod=="H" & !(is.na(main$ethcod))), "hispanic"] <- 1 #for TX
summary(main$hispanic)
table(main$CMETHNIC) #note they only mark 800 people in NC; I'm doubtful about how consistent they are here, though it could also be low rates
table(main$ethcod, exclude = NULL)

#use WRU to identify hispanic identity and then after that rely on dps records for race.
main$surname <- main$last_name
mainwru <- predict_race(main, census.surname = TRUE, surname.only = TRUE)
# dim(mainwru); head(mainwru)
summary(mainwru$pred.his)
mainwru$wruhispanic <- ifelse(((mainwru$pred.his > mainwru$pred.whi) & (mainwru$pred.his > mainwru$pred.bla) & (mainwru$pred.his > mainwru$pred.asi) & 	(mainwru$pred.his > mainwru$pred.oth)), 1, 0) #so if pred.his is larger than other fields, set to hispanic
summary(mainwru$wruhispanic)
#so now I'm going to set up a "race" field with categories hispanic, white, black, other, pretending those are mutually exclusive
mainwru$wrurace <- "other"
mainwru[mainwru$wruhispanic==1, "wrurace"] <- "hispanic"
mainwru[mainwru$wruhispanic==0 & mainwru$Black==1, "wrurace"] <- "black"
mainwru[mainwru$wruhispanic==0 & mainwru$white==1, "wrurace"] <- "white"
table(mainwru$wrurace)
#and split this out into dichotomous variables too.
mainwru$wrublack <- ifelse(mainwru$wrurace=="black", 1,0)
mainwru$wruwhite <- ifelse(mainwru$wrurace=="white", 1,0)
mainwru$wruother <- ifelse(mainwru$wrurace=="other", 1,0)
#now overwrite main dataset so we have all these new columns
main <- mainwru

# saveRDS(object = mainwru, file = "tmp/main_NCTX.rds")
write_csv(mainwru, file = "tmp/main_NCTX.csv") #here is main identified version of project dataset (including names/DOBs, etc, so not for posting)

###########################################################################
# variables to keep for main analysis
###########################################################################
keep <- c("reg2020", "voted2020gen", "Treatment", "ageyears", "male", "wrublack", "wruwhite", "wrurace", "pastincarc", "lastincarcend", "list", "prereg2020", "still_in_custody", "any_felony", "registrationdate", "wruhispanic", "pastsupervision", "CIPRIOFF")

# extract relevant columns for the paper
main <- main[, keep]

# for Figure 1 
wholedeid <- main[, c("Treatment", "reg2020", "voted2020gen","registrationdate","list")]
write.csv(wholedeid, "data/deidentifieddataset_fall2022.csv") 

# file for main analysis
# saveRDS(object = main, file = "data/main_paper.rds")
write_csv(main, file = "data/main_paper.csv")

#now, in spring 2025: multiply outcomes by 100 to make it easier to describe as percentage points for R1, I guess
main <- read_csv("data/main_paper.csv")
main$reg2020 <- main$reg2020*100
main$voted2020gen <- main$voted2020gen*100

# file for main analysis
# saveRDS(object = main, file = "data/main_paper.rds")
write_csv(main, file = "data/main_paper_2025.csv") #here is the main deid dataset for main-paper analyses



###########################################################################
# Next, pull in some additional detail that will be needed for comparison
# to other published work (in replication_samplecomparisons.R)
###########################################################################
main <- read_csv("tmp/main_NCTX.csv")
main <-  mainfull[mainfull$list == "NCmainlist",]



###########################################################################
# Now, construct other deid datasets needed for SI analyses
# starting with data-loss figures
###########################################################################

## this section uses data from NCDPS to produce a deidentified dataset that will be used to compare 
## the broader population of interest to our experimental sample
## This code shows the data processing that leads to "nc_dataloss_describe.csv"

# note it's fixed-width with a separate file describing columns, annoying.
offenderprofilelayout <- read.fwf("../Data/DOPSrecords/OFNT3AA1.des", widths=c(14,34,10,8,6), header=F) #read in the description file (w/o header because it's acting up)
colnames(offenderprofilelayout) <- c("Name", "Description", "Type", "Start", "Length") #name the columns

#now use this key to read in the actual file:
tmp <- read_fwf("../Data/DOPSrecords/OFNT3AA1.dat", 
                guess_max = 30000,
                fwf_widths(offenderprofilelayout$Length, trimws(offenderprofilelayout$Name)))
offenderprofile <- data.table(tmp)

# also, pull in inmate profiles and see how they look (and also Probation/Parole?) using the same approach.
inmateprofilelayout <- read.fwf("../Data/DOPSrecords/INMT4AA1.des", widths=c(14,34,10,8,7), skip=1) #read in the description file
colnames(inmateprofilelayout) <- c("Name", "Description", "Type", "Start", "Length") #name the columns

tmp <- read_fwf("../Data/DOPSrecords/INMT4AA1.dat", 
                guess_max = 100000,
                fwf_widths(inmateprofilelayout$Length, trimws(inmateprofilelayout$Name)))

inmateprofile <- data.table(tmp)

# merge inmates to offenders
offenderinmate <- merge(offenderprofile, inmateprofile, by.x="CMDORNUM", by.y="CIDORNUM", all.x=T) 

# same deal for probation/parole data
PPprofilelayout <- read.fwf("../Data/DOPSrecords/APPT7AA1.des", widths=c(14,34,10,8,7), skip=1) #read in the description file
colnames(PPprofilelayout) <- c("Name", "Description", "Type", "Start", "Length") #name the columns

tmp <- read_fwf("../Data/DOPSrecords/APPT7AA1.dat",
                guess_max = 100000,
                fwf_widths(PPprofilelayout$Length, trimws(PPprofilelayout$Name)))

supervisionprofile <- data.table(tmp)

# this is everybody who's been convicted, with information about incarceration (if they went) and supervision (if they were).
offender_jailsup <- merge(offenderinmate, supervisionprofile, by.x="CMDORNUM", by.y="CDDORNUM", all.x=T)

offender_jailsup[, drop:=0] #flag for people we'll omit
offender_jailsup[trimws(as.character(INMRCDSTA))=="ACTIVE", drop:=1] #currently incarcerated
offender_jailsup[trimws(as.character(INMRCDSTA))=="PAROLED", drop:=1] #supervision data should exclude these, but just in case let's do it here too. 
offender_jailsup[trimws(as.character(CILAMVTY))=="DEATH", drop:=2] #died in system
offender_jailsup[!(trimws(as.character(CIFELONY))=="FELON"), drop:=3] #unlike the commented approach above, let's also drop people where this field is unclear (not felony/misd but question marks)
offender_jailsup[!(trimws(as.character(CDFELONY))=="FELON"), drop:=3]
offender_jailsup[trimws(as.character(PPRCDSTA))=="ACTIVE", drop:=1] #currently under supervision
offender_jailsup[trimws(as.character(GDSTATUS))=="EXPIRED ABSCONDERABX" | trimws(as.character(GDSTATUS))=="OUT OF STATE     OOS" , drop:=3] 

# clean up the demographic variables

offender_jailsup <- offender_jailsup %>% 
  mutate(drop_desc = case_when(drop==0~"Keep",
                                drop==1~"Under supervision",
                                drop==2~"Deceased",
                                drop==3~"Status unclear"),
          race = trimws(as.character(CMCLRACE)),
          race_black = ifelse(race=="BLACK",1,0),
          race_white = ifelse(race=="WHITE",1,0),
          race_asian = ifelse(race=="ASIAN/ORTL",1,0),
          race_native = ifelse(race=="INDIAN",1,0),
          race_unkown = ifelse(c(race=="UNKNOWN"|race=="OTHER"|race=="??????????????????????????????"),1,0),
          dob = as.Date(offender_jailsup$CMCLBRTH),
          age = c((as.Date("2020-11-15")-dob)/365.25),
          age = as.numeric(round(age,digits=0)),
          gender = trimws(as.character(CMCLSEX)),
          hispanic = trimws(as.character(CMETHNIC)),
          hispanic = ifelse(hispanic=="HISPANIC/LATINO",1,0),
          end_date = as.Date(CILAMVDT, format="%Y-%m-%d"),
          treatment_day = as.Date("2020-06-01"),
          time_since_release = as.numeric((treatment_day - end_date)/365.25),
          birth_year = substr(as.character(CMCLBRTH), 1,4))


offender_jailsup$time_since_release[offender_jailsup$time_since_release < 0] <- 0

offender_jailsup %>% 
  dplyr::select(race_black, race_white, race_asian, race_native, race_unkown, hispanic, 
                gender, age, time_since_release, drop, CMCLBRTH, birth_year) -> offender_jailsup_use

# function to compute summary statistics
compute_sum <- function(data, addrow) {
  data %>% 
    summarise(black = sum(race_black),
              white = sum(race_white),
              asian = sum(race_asian),
              native = sum(race_native),
              race_unknown = sum(race_unkown),
              hispanic = sum(hispanic),
              female = sum(gender == "FEMALE"),
              male = sum(gender == "MALE"),
              mean_age = mean(age) ,
              mean_release_time = mean(time_since_release, na.rm=TRUE)) %>% 
    mutate(row = addrow)
}

## summary statistics for DOPS data
start_demos <- compute_sum(data = offender_jailsup_use, addrow = "Universe")
cut1 <- compute_sum(data = offender_jailsup_use %>% filter(drop!=1), addrow = "Under supervision")
cut2 <- compute_sum(data = offender_jailsup_use %>% filter(drop != 1 & drop !=2), addrow = "Deceased")
cut3 <- compute_sum(data = offender_jailsup_use %>% filter(drop != 1 & drop !=2 & drop != 3), 
                    addrow = "Status unclear")

# check duplicates

exoffendersformatch <- offender_jailsup[drop==0, ]
exoffendersformatch$last_name <- trimws(ifelse(is.na(exoffendersformatch$CDSLSTNM),
                                                as.character(exoffendersformatch$CICLSTNM), 
                                                as.character(exoffendersformatch$CDSLSTNM)))
exoffendersformatch$first_name <- trimws(ifelse(is.na(exoffendersformatch$CDSFSTNM),
                                                as.character(exoffendersformatch$CICFSTNM), 
                                                as.character(exoffendersformatch$CDSFSTNM)))
exoffendersformatch$middle_name <- trimws(ifelse(is.na(exoffendersformatch$CDSMIDNM),
                                                as.character(exoffendersformatch$CICMIDIN), 
                                                as.character(exoffendersformatch$CDSMIDNM)))

exoffendersformatch[, personcount:=.N, by=list(last_name, first_name,CMCLBRTH)]

cut4 <- compute_sum(data = exoffendersformatch[personcount==1,], addrow = "Duplicates")

# check those who are older than 70
exoffendersformatch[, birth_year:= substr(as.character(CMCLBRTH), 1,4)] #extract birth year from DOB

exoffendersformatch[ , agedrop:= 0]
exoffendersformatch[as.numeric(birth_year)<=1949, agedrop:= 1]
exoffendersformatch[CMCLBRTH == "0001-01-01", agedrop:= 1] #and also flag people with missing DOBs here

# condition by personcount or not
cut5 <- compute_sum(data = exoffendersformatch[agedrop==0 & personcount==1,], addrow = "Over 70")

exoffendersformatch[trimws(as.character(CMCLSEX))=="FEMALE", gender_code:="F"]; 
exoffendersformatch[trimws(as.character(CMCLSEX))=="MALE", gender_code:="M"]

exoffendersformatch[, birth_year:= substr(as.character(CMCLBRTH), 1,4)] #extract birth year from DOB

exoffendersformatch$name_suffix_lbl <- trimws(ifelse(is.na(exoffendersformatch$CDSSUFFX),
                                                      as.character(exoffendersformatch$CICSUFIX), 
                                                      as.character(exoffendersformatch$CDSSUFFX)))#note that in inmate data this is just initial.

cut6 <- compute_sum(data = exoffendersformatch[agedrop==0 & !(is.na(last_name)) & personcount==1,], 
                    addrow = "No last name")


exoff_dropnoncit <- exoffendersformatch[!(trimws(CMCITIZN)=="ALIEN") & agedrop==0 & !(is.na(last_name)) & personcount==1,]; 

cut7 <- compute_sum(data = exoff_dropnoncit, 
                    addrow = "Citizens") 

oldsample1 <- read.csv("../Data/TPsample_50k_nov2019.csv"); dim(oldsample1)
oldsample2 <- read.csv("../Data/DOPSrecords/TPsample_70k_apr2020.csv"); dim(oldsample2)
oldsample3 <- read.csv("../Data/DOPSrecords/TPsample_35k_may2020.csv"); dim(oldsample3)
oldsample <- rbind(oldsample1, oldsample2, oldsample3)
datasetforsample <- exoff_dropnoncit[!(CMDORNUM %in% oldsample$CMDORNUM),]; dim(datasetforsample) #July 2020 

cut8 <- compute_sum(data = datasetforsample, addrow = "Used in pilots")

mainasst <- read.csv("../Data/mainstudyfall2020/mainstudyfall2020_fulltreatmentasstdataset.csv")
NCfull <- NC <- mainasst[mainasst$list == "NCmainlist",]

nc_matched <- NCfull %>% separate(CMCLBRTH, c("month","day","yob"),sep="/",remove=FALSE)
# names(nc_matched)
nc_matched <- nc_matched %>% mutate(race = trimws(as.character(CMCLRACE)),
                                race_black = ifelse(race=="BLACK",1,0),
                                race_white = ifelse(race=="WHITE",1,0),
                                race_asian = ifelse(race=="ASIAN/ORTL",1,0),
                                race_native = ifelse(race=="INDIAN",1,0),
                                race_unkown=ifelse(c(race=="UNKNOWN"|race=="OTHER"|race=="??????????????????????????????"),1,0),
                                age = as.numeric(round(c(2020-as.numeric(yob)),digits=0)),
                                gender = trimws(as.character(CMCLSEX)),
                                hispanic1 = trimws(as.character(CMETHNIC)),
                                hispanic = ifelse(hispanic1=="HISPANIC/LATINO",1,0))


nc_matched$end_date <- as.Date(nc_matched$CILAMVDT, format="%m/%d/%Y")
nc_matched$treatment_day <- as.Date("2020-06-01")
nc_matched$time_since_release <- as.numeric(c(nc_matched$treatment_day - nc_matched$end_date)/365.25)
nc_matched$time_since_release[nc_matched$time_since_release<0] <-0

cut9 <- compute_sum(data = nc_matched, 
                    addrow = "Final: Addresses found")

NC_dataloss <- rbind(start_demos,cut1,cut2,cut3,cut4,cut5,cut6,cut7,cut8,cut9)

NC_dataloss <- NC_dataloss %>% mutate(total_pop = female + male,
                          prop_black = black/total_pop,
                          prop_white = white/total_pop,
                          prop_asian = asian/total_pop,
                          prop_native = native/total_pop,
                          prop_unknown = race_unknown/total_pop,
                          prop_hispanic = hispanic/total_pop,
                          prop_male = male/total_pop,
                          prop_female = female/total_pop,
                          lagged_row = lag(total_pop, 1),
                          num_data_lost = lagged_row - total_pop,
                          prop_data_lost1 = num_data_lost/lagged_row,
                          prop_data_lost = round(prop_data_lost1, digits=3))

NC_dataloss <- NC_dataloss[, c("row", "black", "white", "asian", "native", "race_unknown", 
                            "hispanic", "female", "male", "mean_age", "mean_release_time", 
                            "total_pop", "prop_black", "prop_white", "prop_asian", "prop_native", 
                            "prop_unknown", "prop_hispanic", "prop_male", "prop_female", "lagged_row", 
                            "num_data_lost", "prop_data_lost")]

# save the data loss data
write.csv(NC_dataloss, "tmp/nc_dataloss_describe.csv") #write out deidentified dataset for data-loss analyses shown in SI


###########################################################################
# Continue to construct other deid datasets needed for SI analyses
# Next, comparison-group study data that's used in Tables A2, A13
###########################################################################

## pull in the comparison-group treatment assignment file
comparison <- read.csv("../Data/mainstudyfall2020/comparisongroupstudyfall2020_fulltreatmentasstdataset.csv")

##update june 2021: we got vendor to append YOB (they didn't have DOB for this group) to this file, so can do a merge that's more similar to the main one now.
YOBs <- read.csv("../Data/TP_aug2020/Net New NC Unregistered (TP Results 2)_TPYOBappendjune2021.csv")
YOBs$index <- 1:nrow(YOBs)

compmerge <- merge(comparison, YOBs, by=c("TP.First.Name", "TP.Middle.Name", "TP.Last.Name","TP.Name.Suffix", "TP.Address.1", "TP.Address.2", "TP.Address.3", "TP.City.1", "TP.County.1", "TP.Zip.Code.1"))

comp <- compmerge[!duplicated(compmerge$index),]; dim(comp)
comp$birth_year <- comp$TP.Birth.Year

#flag unique addresses
comp$uniqueadd <- 1
comp[duplicated(comp$TP.Address.1)==T, "uniqueadd"] <- 0
comp[duplicated(comp$TP.Address.1, fromLast = T)==T, "uniqueadd"] <- 0


compsamp <- data.table(comp)
#line up some colnames:
compsamp[, mail_addr1 := TP.Address.1]
compsamp[, mail_city := TP.City.1]
compsamp[, mail_zipcode := as.character(TP.Zip.Code.1)]
compsamp[, county_desc := TP.County.1]
compsamp[, first_name := TP.First.Name]
compsamp[, middle_name := TP.Middle.Name]
compsamp[, last_name := TP.Last.Name]
compsamp[, fullname:= paste(first_name, middle_name, last_name, sep=" ")]
compsamp[, CMDORNUM:=1:.N]


## load voter files for merge
voters <- fread("../Data/mainstudyfall2020/NCvoterdata/ncvoter_Statewide.txt",  
                header=T, 
                colClasses=c(rep("character", 71)), 
                select=c("county_desc", "voter_reg_num", "name_prefx_cd", "first_name", "middle_name", "last_name", "name_suffix_lbl", "birth_year", "birth_age", "voter_status_desc", "voter_status_reason_desc", "res_street_address", "res_city_desc", "state_cd", "zip_code","mail_addr1", "mail_addr2", "mail_addr3", "mail_addr4", "mail_city", "mail_state", 		"mail_zipcode","registr_dt", "race_code","ethnic_code","party_cd","gender_code","birth_state")) 

voters[, voterID := paste(county_desc, voter_reg_num, sep="")] 
voters[, firstinitial:= substr(first_name, 1, 1)]
voters[, registrationdate := as.Date(registr_dt, "%m/%d/%Y")]
newvoters <- voters[registrationdate >= "2020-08-01" & registrationdate <= "2020-11-03",] 
newvoters[, birth_year := as.numeric(birth_year)]
dim(newvoters)
regcounties <- table(newvoters$county_desc); dim(regcounties) #all counties are reporting new reg. voters (as it should be)

#also want to pull in vote hist (2020)
votehist <- fread("../Data/mainstudyfall2020/NCvoterdata/ncvhis_Statewide.txt", 
                  header=T, colClasses=c(rep("character", 15)),
                  select=c("county_desc", "voter_reg_num", "election_lbl", "election_desc", "voted_party_cd"))

votehist[, voterID := paste(county_desc, voter_reg_num, sep="")] #stick county name and voter reg 
votes2020gen<- votehist[votehist$election_lbl=="11/03/2020",]; dim(votes2020gen) #should be 5.54mil votes, according to NCSBE
#note 2.8.21: I found our late-Nov voter file pull didn't have full vote history updated, so I've pulled in an updated vote-history file from the state, but left reg. alone
# counties <- table(votes2020gen$county_desc); dim(counties) #all 100 counties reporting
newvoters[, voted2020gen:=0]
newvoters[voterID %in% votes2020gen$voterID, voted2020gen:=1] 
summary(newvoters$voted2020gen)

newvoters[, onlinefieldsmiss:= 0]
newvoters[gender_code=="U" & race_code=="U" & birth_state=="", onlinefieldsmiss:= 1]
newvoters[, fullname:= paste(first_name, middle_name, last_name, sep=" ")]


## merge voter file data
# start with exact name/address matches, then broaden out. 
variablenames <- c("last_name","first_name", "middle_name", "mail_addr1", "mail_city", "mail_zipcode", "county_desc", "birth_year")
exactmatches <- merge(newvoters, compsamp, by=variablenames)
dim(exactmatches)
#okay, but are there other people we're missing? 
#are there people who match exactly on name (first/last, maybe middle if present)? plus address closeness?
exactnamematches1 <- merge(newvoters, compsamp, by=c("last_name","first_name", "middle_name", "birth_year"))
dim(exactnamematches1)
exactnamematches1 <- exactnamematches1[!(CMDORNUM %in% exactmatches$CMDORNUM),] #drop the matches we already found
dim(exactnamematches1); head(subset(exactnamematches1, select=c("last_name","first_name", "middle_name", "mail_addr1.x", "mail_city.x", "mail_zipcode.x", "county_desc.x", "mail_addr1.y", "mail_city.y", "mail_zipcode.y", "county_desc.y")))
exactnamematches1[county_desc.x==county_desc.y, countymatch:=1]; table(exactnamematches1$countymatch) #most are in diff. county
exactnamematches1[mail_city.x==mail_city.y, citymatch:=1]; table(exactnamematches1$citymatch) #some in same city
exactnamematches1[mail_zipcode.x==mail_zipcode.y, zipmatch:=1]; table(exactnamematches1$zipmatch) #fewer in same zip
exactnamematches1 <- exactnamematches1[countymatch==1,] #geographic limitation


firstlastnames <- merge(newvoters, compsamp, by=c("last_name","first_name", "birth_year")) #drop middle name criterion
dim(firstlastnames)
firstlastnames1 <- firstlastnames[!(CMDORNUM %in% exactmatches$CMDORNUM) & !(CMDORNUM %in% exactnamematches1$CMDORNUM),] #drop the matches we already found
dim(firstlastnames1) #okay, this is going to be out of control
firstlastnames1[,mnamelengthx:= nchar(middle_name.x)] #how many characters in this field?
firstlastnames1[,mnamelengthy:= nchar(middle_name.y)] 
firstlastnames1[mnamelengthx==1 & middle_name.x == substr(middle_name.y,0,1), keep:=1] #initial-only matches initialized version: X/Y
firstlastnames1[mnamelengthy==1 & middle_name.y == substr(middle_name.x,0,1), keep:=1] # same but for Y/X

firstlastnames2 <- firstlastnames1[keep==1,] #keep just the ones we flagged
dim(firstlastnames2); head(subset(firstlastnames2, select=c("last_name","first_name", "middle_name.x", "middle_name.y","birth_year", "mail_addr1.x", "mail_city.x", "mail_zipcode.x", "county_desc.x", "mail_addr1.y", "mail_city.y", "mail_zipcode.y", "county_desc.y")))
firstlastnames2[county_desc.x==county_desc.y, countymatch:=1]; table(firstlastnames2$countymatch) #some are in same county
firstlastnames2[mail_city.x==mail_city.y, citymatch:=1]; table(firstlastnames2$citymatch) #some in same city
firstlastnames2[mail_zipcode.x==mail_zipcode.y, zipmatch:=1]; table(firstlastnames2$zipmatch) #some in same zip
length(unique(firstlastnames2$CMDORNUM)) #again, just a handful of duplicates
firstlastnames2 <- firstlastnames2[countymatch==1,] 

#but also, what about people who register at the same address we have, with similar name?
exactaddressYOBmatches1 <- merge(newvoters, compsamp, by=c("mail_addr1", "mail_city", "mail_zipcode", "birth_year"))
dim(exactaddressYOBmatches1) #lots, but probably lots of completely different names. 
exactaddressYOBmatches1[, fnamedist := stringdist(first_name.x, first_name.y, method="jw")] #calculate string distances between these names
exactaddressYOBmatches1[, mnamedist := stringdist(middle_name.x, middle_name.y, method="jw")] 
exactaddressYOBmatches1[, lnamedist := stringdist(last_name.x, last_name.y, method="jw")] 
summary(exactaddressYOBmatches1$lnamedist)
exactaddressYOBmatches1[, fullnamedist := stringdist(fullname.x, fullname.y, method="jw")] #in case people use maiden as middle name? we'll see
summary(exactaddressYOBmatches1$fullnamedist)#look through these and decide whether to include. 
head(subset(exactaddressYOBmatches1, select=c("fullname.x", "fullname.y","fullnamedist")), 20)
#this is helping us catch married names

exactaddressYOBmatches2 <- exactaddressYOBmatches1[(fnamedist <.25 & lnamedist <.25) | fullnamedist <.25,]; dim(exactaddressYOBmatches2) #trim down to closer name matches
summary(exactaddressYOBmatches2$lnamedist) #note these are mostly exact matches, actually
#middle names trickier, since some are missing and some are only initials
exactaddressYOBmatches2[,mnamelengthx:= nchar(middle_name.x)] #how many characters in this field?
exactaddressYOBmatches2[,mnamelengthy:= nchar(middle_name.y)]
summary(exactaddressYOBmatches2$mnamelengthx)
exactaddressYOBmatches2[ (mnamelengthx >1 & mnamelengthy >1) & mnamedist >.1, mnamemismatch:=1]; table(exactaddressYOBmatches2$mnamemismatch)
exactaddressYOBmatches3 <- exactaddressYOBmatches2[is.na(mnamemismatch),]; dim(exactaddressYOBmatches3)
 
addressYOBmatches <- exactaddressYOBmatches3[!(CMDORNUM %in% exactmatches$CMDORNUM) & !(CMDORNUM %in% firstlastnames2$CMDORNUM)& !(CMDORNUM %in% exactnamematches1$CMDORNUM),]; dim(addressYOBmatches)

allmatches <- rbind(exactmatches, addressYOBmatches, exactnamematches1, firstlastnames2, fill=TRUE) #stick together all these match approaches
dim(allmatches); length(unique(allmatches$CMDORNUM)); dim(unique(allmatches))


## check voter registration status
allmatches <- allmatches[voter_status_desc =="ACTIVE",]; dim(allmatches) #can comment out if we want to look at efforts to be reg.
comp$reg2020 <- ifelse(compsamp$CMDORNUM %in% allmatches$CMDORNUM, 1, 0) #then use it to fill in the registration variable

comp$voted2020gen <- 0 #also, turnout
comp[compsamp$CMDORNUM %in% allmatches[allmatches$voted2020gen==1,]$CMDORNUM, "voted2020gen"] <- 1
table(comp$voted2020gen)

#also, look at partisanship among registrants as we do for main sample.
comp$party <- allmatches$party_cd[match(compsamp$CMDORNUM,allmatches$CMDORNUM)]



## load main data
mainasst <- read.csv("../Data/mainstudyfall2020/mainstudyfall2020_fulltreatmentasstdataset.csv")
NC <- mainasst[mainasst$list=="NCmainlist",] #main NC list
NCsamp <- data.table(NC)

#line up some colnames:
NCsamp[, mail_addr1 := TP.Address.1]
NCsamp[, mail_city := TP.City.1]
NCsamp[, mail_zipcode := as.character(TP.Zip.Code.1)]
NCsamp[, county_desc := TP.County.1]
NCsamp[, dpsname:= paste(first_name, last_name, sep=" ")]
NCsamp[, fullname:= paste(first_name, middle_name, last_name, sep=" ")]


#redoing main-file merge to be parallel here
newvoters[, fullname:= paste(first_name, middle_name, last_name, sep=" ")]

#then here's the manual merge code (again, pulled pretty much verbatim from "NC_march2020pilot_analysisfirstcut.R")
# start with exact name/yob/address matches, then broaden out. 
variablenames <- c("last_name","first_name", "middle_name", "birth_year", "mail_addr1", "mail_city", "mail_zipcode", "county_desc")
exactmatches <- merge(newvoters, NCsamp, by=variablenames)
dim(exactmatches)
exactnamematches1 <- merge(newvoters, NCsamp, by=c("last_name","first_name", "middle_name", "birth_year"))
dim(exactnamematches1)
exactnamematches1 <- exactnamematches1[!(CMDORNUM %in% exactmatches$CMDORNUM),] #drop the matches we already found
exactnamematches1[county_desc.x==county_desc.y, countymatch:=1]; table(exactnamematches1$countymatch) #most are in same county
exactnamematches1[mail_city.x==mail_city.y, citymatch:=1]; table(exactnamematches1$citymatch) #some in same city
exactnamematches1[mail_zipcode.x==mail_zipcode.y, zipmatch:=1]; table(exactnamematches1$zipmatch) #fewer in same zip
length(unique(exactnamematches1$CMDORNUM)) #couple of duplicates
exactnamematches1 <- exactnamematches1[countymatch==1,] 

firstlastnames <- merge(newvoters, NCsamp, by=c("last_name","first_name", "birth_year")) #drop middle name criterion
dim(firstlastnames)
firstlastnames1 <- firstlastnames[!(CMDORNUM %in% exactmatches$CMDORNUM) & !(CMDORNUM %in% exactnamematches1$CMDORNUM),] #drop the matches we already found
dim(firstlastnames1)
firstlastnames1[,mnamelengthx:= nchar(middle_name.x)] #how many characters in this field?
firstlastnames1[,mnamelengthy:= nchar(middle_name.y)] 
firstlastnames1[mnamelengthx==1 & middle_name.x == substr(middle_name.y,0,1), keep:=1] #initial-only matches initialized version: X/Y
firstlastnames1[mnamelengthy==1 & middle_name.y == substr(middle_name.x,0,1), keep:=1] # same but for Y/X

firstlastnames2 <- firstlastnames1[keep==1,] #keep just the ones we flagged
dim(firstlastnames2); head(subset(firstlastnames2, select=c("last_name","first_name", "middle_name.x", "middle_name.y","birth_year", "mail_addr1.x", "mail_city.x", "mail_zipcode.x", "county_desc.x", "mail_addr1.y", "mail_city.y", "mail_zipcode.y", "county_desc.y")))
firstlastnames2[county_desc.x==county_desc.y, countymatch:=1]; table(firstlastnames2$countymatch) #most are in same county
firstlastnames2[mail_city.x==mail_city.y, citymatch:=1]; table(firstlastnames2$citymatch) #many in same city
firstlastnames2[mail_zipcode.x==mail_zipcode.y, zipmatch:=1]; table(firstlastnames2$zipmatch) #some in same zip
length(unique(firstlastnames2$CMDORNUM)) #again, just a handful of duplicates
firstlastnames2 <- firstlastnames2[countymatch==1,] 

#but also, what about people who register at the same address we have, with same YOB and similar name?
exactaddressYOBmatches1 <- merge(newvoters, NCsamp, by=c("birth_year", "mail_addr1", "mail_city", "mail_zipcode"))
dim(exactaddressYOBmatches1) #lots, but probably lots of completely different names. 
exactaddressYOBmatches1[, fnamedist := stringdist(first_name.x, first_name.y, method="jw")] #calculate string distances between these names
exactaddressYOBmatches1[, mnamedist := stringdist(middle_name.x, middle_name.y, method="jw")] 
exactaddressYOBmatches1[, lnamedist := stringdist(last_name.x, last_name.y, method="jw")] 
summary(exactaddressYOBmatches1$lnamedist)
exactaddressYOBmatches1[, fullnamedist := stringdist(fullname.x, fullname.y, method="jw")] #in case people use maiden as middle name? we'll see
summary(exactaddressYOBmatches1$fullnamedist)#look through these and decide whether to include. 
head(subset(exactaddressYOBmatches1, select=c("fullname.x", "fullname.y","fullnamedist")), 20)

exactaddressYOBmatches2 <- exactaddressYOBmatches1[(fnamedist <.25 & lnamedist <.25) | fullnamedist <.25,]; dim(exactaddressYOBmatches2) #trim down to closer name matches
summary(exactaddressYOBmatches2$lnamedist) #note these are mostly exact matches, actually
#middle names trickier, since some are missing and some are only initials
exactaddressYOBmatches2[,mnamelengthx:= nchar(middle_name.x)] #how many characters in this field?
exactaddressYOBmatches2[,mnamelengthy:= nchar(middle_name.y)]
summary(exactaddressYOBmatches2$mnamelengthx)
exactaddressYOBmatches2[ (mnamelengthx >1 & mnamelengthy >1) & mnamedist >.1, mnamemismatch:=1]; table(exactaddressYOBmatches2$mnamemismatch)
exactaddressYOBmatches3 <- exactaddressYOBmatches2[is.na(mnamemismatch),]; dim(exactaddressYOBmatches3) 
addressYOBmatches <- exactaddressYOBmatches3[!(CMDORNUM %in% exactmatches$CMDORNUM) & !(CMDORNUM %in% firstlastnames2$CMDORNUM) & !(CMDORNUM %in% exactnamematches1$CMDORNUM),]; dim(addressYOBmatches)

allmatches2 <- rbind(exactmatches, addressYOBmatches, exactnamematches1, firstlastnames2, fill=TRUE) #stick together all these match approaches
dim(allmatches2); length(unique(allmatches2$CMDORNUM)); dim(unique(allmatches2))
table(allmatches2$voter_status_desc)

NC$reg2020 <- ifelse(NC$CMDORNUM %in% allmatches2$CMDORNUM, 1, 0) #then use it to fill in the registration variable
table(NC$reg2020)

NC$voted2020gen <- 0 #also, turnout
NC[NC$CMDORNUM %in% allmatches2[allmatches2$voted2020gen==1,]$CMDORNUM, "voted2020gen"] <- 1
table(NC$voted2020gen)

NC$party <- allmatches2$party_cd[match(NC$CMDORNUM,allmatches2$CMDORNUM)]
NC$anytreat <- ifelse(NC$Treatment=="T1", 0,1); table(NC$anytreat)

## save deid files for analysis
write_csv(comp[, c("party", "Treatment", "reg2020", "voted2020gen")], file = "data/comparison_party_treatment.csv")
write_csv(NC[, c("party", "Treatment", "reg2020", "voted2020gen", "anytreat")], file = "data/NC_party_treatment.csv")

###########################################################################
# Continue to construct other deid datasets needed for SI analyses
# Next, postcard-bounce data used for Table A9
###########################################################################

#going to pull in a bunch of lists we received back from mail vendor tracking bounces
#combine those, then merge to main treatment lists to see whose postcards bounced
scans <- as.data.frame(read_excel("../Data/postcards/MITMATCHES_9_1 THRU 9_15.xlsx", guess_max = 10000))
dim(scans)#spreadsheet includes only observations for addresses that were not deliverable

#Making a flag for which Demar list an obs comes from
scans$firstlist<-1
scans$secondlist<-0
scans$thirdlist<-0
scans$fourthlist<-0
scans$fifthlist<-0
scans$sixthlist<-0

#Now bringing in the second set of bounced/undelivered postcards
scans2 <- as.data.frame(read_excel("../Data/postcards/MITMATCHES_9_16 THRU 9_30.xlsx", guess_max = 5000))
dim(scans2)
#OK 8279 observations

#Making a flag for which Demar list an obs comes from
scans2$firstlist<-0
scans2$secondlist<-1
scans2$thirdlist<-0
scans2$fourthlist<-0
scans2$fifthlist<-0
scans2$sixthlist<-0

#Now bringing in the third set of bounced/undelivered postcards
#Removed blank column that was in position BL from this. Was preventing columns from lining up with other batches.
scans3 <- as.data.frame(read_excel("../Data/postcards/MITMATCHES_10_1 THRU_10_15.xlsx", guess_max = 2000))
dim(scans3)
#OK 646 observations

#Making a flag for which Demar list an obs comes from
scans3$firstlist<-0
scans3$secondlist<-0
scans3$thirdlist<-1
scans3$fourthlist<-0
scans3$fifthlist<-0
scans3$sixthlist<-0

#Now bringing in the fourth set of bounced/undelivered postcards
#Removed blank column that was in position BL from this. Was preventing columns from lining up with other batches.
scans4 <- as.data.frame(read_excel("../Data/postcards/MIT_MATCHES_11_1_21_THRU_11_15_21.xlsx", guess_max = 2000))
dim(scans4)

#Making a flag for which Demar list an obs comes from
scans4$firstlist<-0
scans4$secondlist<-0
scans4$thirdlist<-0
scans4$fourthlist<-1
scans4$fifthlist<-0
scans4$sixthlist<-0

#Now bringing in the fifth set of bounced/undelivered postcards
#Saved as .xlsx, removed header tab 
scans5 <- as.data.frame(read_excel("../Data/postcards/MIT_MATCHES_11_16_21_THRU_11_30_21.xlsx", guess_max = 2000))
dim(scans5)

#Making a flag for which Demar list an obs comes from
scans5$firstlist<-0
scans5$secondlist<-0
scans5$thirdlist<-0
scans5$fourthlist<-0
scans5$fifthlist<-1
scans5$sixthlist<-0

#Now bringing in the sixth set of bounced/undelivered postcards
#Saved as .xlsx, removed header tab 
scans6 <- as.data.frame(read_excel("../Data/postcards/MIT_MATCHES_12_1_21_THRU_12_15_21.xlsx", guess_max = 2000))
dim(scans6)

#Making a flag for which Demar list an obs comes from
scans6$firstlist<-0
scans6$secondlist<-0
scans6$thirdlist<-0
scans6$fourthlist<-0
scans6$fifthlist<-0
scans6$sixthlist<-1

#Select important columns from bounced data (can adjust later if needed)
keep<- c("custid", "name", "externalda", "Deliverability Code")
trim<- scans[, keep]; dim(trim)
trim2<- scans2[, keep]; dim(trim2)
trim3<- scans3[, keep]; dim(trim3)
trim4<- scans4[, keep]; dim(trim4)
trim5<- scans5[, keep]; dim(trim5)
trim6<- scans6[, keep]; dim(trim6)

scanstrimmed <- rbind(trim,trim2,trim3,trim4,trim5,trim6); dim(scanstrimmed)

# Going to try to merge these with the address list that was originally sent to Demar
postcardsfull<- read.csv("../Data/mainstudyfall2020/postcardmailinglist_summer2021.csv")
postcardsfull$custid <- postcardsfull$X

#Merging with complete address list, and keeping just those that merge (so the bounced)
addmerge <- merge(postcardsfull, scanstrimmed, all.y = TRUE, by= c("custid")); dim(addmerge)
#Merge looks good
#Trimming again before merging with main datasets
keep2<-c("custid", "first_name", "last_name", "name", "mailingaddress", "mailingcity", "mailingstate", "mailingzip", "list", "externalda", "svycode", "Deliverability Code", "svycode")
bounce<- addmerge[,keep2]; dim(bounce)

#Reading in the main study data
maindata <- as.data.frame(read.csv("../Data/mainstudyfall2020/mainstudyfall2020_fulltreatmentasstdataset.csv"))

#Reading in the comparison study data
comparisondata <- as.data.frame(read.csv("../Data/mainstudyfall2020/comparisongroupstudyfall2020_fulltreatmentasstdataset.csv"))

#First and last names, middle name
comparisondata$last_name<- trimws(comparisondata$TP.Last.Name)
comparisondata$first_name<- trimws(comparisondata$TP.First.Name)
comparisondata$middle_name<- trimws(comparisondata$TP.Middle.Name)

#Address fields
comparisondata$mailingaddress<- trimws(comparisondata$TP.Address.1)
comparisondata$mailingzip<- trimws(comparisondata$TP.Zip.Code.1)
comparisondata$mailingstate<- trimws(comparisondata$TP.State.1)
comparisondata$mailingcity<- trimws(comparisondata$TP.City.1)

#Making indicators for each dataset to check the merge
bounce$bouncelist <- 1
maindata$mainlist <- 1
comparisondata$comparisonlist <- 1

####Merging the main and bounced list dataframes
table (bounce$list)
#Doing this separately for the NC main study and comparison group, since only have demographics for those in the main study
bounce_m<- subset(bounce, list == "NCmainlist")
mainbounce <- merge(maindata, bounce_m, by= c("last_name", "mailingaddress"), all.y = TRUE); dim(mainbounce)
table(mainbounce$bouncelist , exclude=NULL) 
table(mainbounce$mainlist , exclude=NULL) 

#Just looking at some tables quickly, but first subset maindata to pull out NC main study
table(maindata$list)
mainNC<-subset(maindata, list == "NCmainlist")

# time since release, bounced
mainbounce$CILAMVDT1<-as.factor(mainbounce$CILAMVDT)
mainbounce$CILAMVDT2<-strptime(mainbounce$CILAMVDT1,format="%y/%d/%Y")
mainbounce$end_date <- as.Date(mainbounce$CILAMVDT2, format="%Y-%m-%d")

mainbounce$treatment_day <- as.Date("2020-06-01")
mainbounce$time_since_release <- as.numeric(c(mainbounce$treatment_day - mainbounce$end_date)/365.25)
mainbounce$time_since_release[mainbounce$time_since_release<0] <-0

# time since release, main NC
mainNC$CILAMVDT1<-as.factor(mainNC$CILAMVDT)
mainNC$CILAMVDT2<-strptime(mainNC$CILAMVDT1,format="%y/%d/%Y")
mainNC$end_date <- as.Date(mainNC$CILAMVDT2, format="%Y-%m-%d")

mainNC$treatment_day <- as.Date("2020-06-01")
mainNC$time_since_release <- as.numeric(c(mainNC$treatment_day - mainNC$end_date)/365.25)
mainNC$time_since_release[mainNC$time_since_release<0] <-0

mainNC$bounced <- 0
mainNC[mainNC$CMDORNUM %in% mainbounce$CMDORNUM, "bounced"] <- 1
table(mainNC$bounced)

#now calculate some means, comparing landed/bounced
table(mainNC$bounced, mainNC$male)
mainNC$black <- ifelse(trimws(as.character(mainNC$CMCLRACE))=="BLACK", 1, 0) #set up a simple binary variable using the code from above
mainNC$age <- 2021-mainNC$birth_year

keepcols <- c("bounced","male", "black","age","time_since_release")
write_csv(mainNC, file = "data/NCpostcards_deid.csv")


############################################################################################
## Continue to construct other deid datasets needed for SI analyses
## Next, pilots dataset for SI tabs A11-A12
############################################################################################

##pilots: pull in the original data and keep address, etc. to match to file again
##need to trim these and align all the column formats/names
##what we need: name, DOB, all the mailing address columns, plus which pilot they were in and which arm

##pilot 1 (early 2020):
pilot1 <- data.table(read.csv("../Data/november2019pilot/fulltreatmentasstgroup.csv", stringsAsFactors = FALSE))
keepcols1 <- c("CMDORNUM", "last_name", "first_name", "middle_name", "birth_year", "gender_code", "CMCLRACE", "TP.Street.Address.1", "TP.City.1", "TP.State.1", "TP.Zip.Code.1", "TP.County.1", "treatmentasst", "CICLSTNM", "CILAMVDT") #note we could also pull through more DPS fields if we want incarc background stuff
NCpilot1 <- pilot1[, ..keepcols1]
NCpilot1$pilotnum <- 1

##pilot 2 (march 2020):
pilot2 <- data.table(read.csv("../Data/march2020pilot/fulltreatmentasstgroup.csv", stringsAsFactors = FALSE))
dim(pilot2)
keepcols2 <- c("CMDORNUM", "last_name", "first_name", "middle_name", "birth_year", "gender_code", "CMCLRACE", "TP.Street.Address.1", "TP.City.1", "TP.State.1", "TP.Zip.Code.1", "TP.County.1", "treatmentasst", "CICLSTNM", "CILAMVDT") #T1 = control 
NCpilot2 <- pilot2[, ..keepcols2]
NCpilot2$pilotnum <- 2

##pilot 3 (june 2020):
pilot3 <- data.table(read.csv("../Data/May2020pilot/pilot3_fulltreatmentasstgroup.csv", stringsAsFactors = FALSE))
dim(pilot3)
pilot3$TP.Street.Address.1 <- pilot3$TP.Address.1 #rename to line up
keepcols3 <- c("CMDORNUM", "last_name", "first_name", "middle_name", "birth_year", "gender_code", "CMCLRACE", "TP.Street.Address.1","TP.City.1", "TP.State.1", "TP.Zip.Code.1", "TP.County.1", "treatmentasst", "CICLSTNM", "CILAMVDT")  #again, T1=control
NCpilot3 <- pilot3[, ..keepcols3]
NCpilot3$pilotnum <- 3

#set up a new simpler treatment indicator: 0 if control, 1 if ANY treatment arm
NCpilot1$treat <- ifelse(NCpilot1$treatmentasst=="Treat", 1, 0); table(NCpilot1$treat)
NCpilot2$treat <- ifelse(NCpilot2$treatmentasst=="T1", 0, 1); table(NCpilot2$treat)
NCpilot3$treat <- ifelse(NCpilot3$treatmentasst=="T1", 0, 1); table(NCpilot3$treat)

NC <- rbind(NCpilot1, NCpilot2, NCpilot3)


## load voter file data
voters <- fread("../Data/mainstudyfall2020/NCvoterdata/ncvoter_Statewide.txt",  header=T, colClasses=c(rep("character", 71)), select=c("county_desc", 	"voter_reg_num", "name_prefx_cd", "first_name", "middle_name", "last_name", "name_suffix_lbl", "birth_year", "birth_age", "voter_status_desc", "voter_status_reason_desc", "res_street_address", "res_city_desc", "state_cd", "zip_code","mail_addr1", "mail_addr2", "mail_addr3", "mail_addr4", "mail_city", "mail_state", 		"mail_zipcode","registr_dt", "race_code","ethnic_code","party_cd","gender_code","birth_state")) 

voters[, voterID := paste(county_desc, voter_reg_num, sep="")] 
voters[, firstinitial:= substr(first_name, 1, 1)]
voters[, registrationdate := as.Date(registr_dt, "%m/%d/%Y")]
newvoters <- voters[registrationdate >= "2019-11-01" & registrationdate <= "2020-11-03",] 
newvoters[, birth_year := as.numeric(birth_year)]
dim(newvoters)
regcounties <- table(newvoters$county_desc); dim(regcounties) #all counties are reporting new reg. voters (as it should be)
rm(voters) #clear out memory

#this time we also want to pull in vote hist  and add that in before merge. 
votehist <- fread("../Data/mainstudyfall2020/NCvoterdata/ncvhis_Statewide.txt", header=T, colClasses=c(rep("character", 15)),
	select=c("county_desc", "voter_reg_num", "election_lbl", "election_desc", "voted_party_cd")); dim(votehist)
votehist[, voterID := paste(county_desc, voter_reg_num, sep="")] 
votes2020gen <- votehist[votehist$election_lbl=="11/03/2020",]; dim(votes2020gen) #should be 5.54mil votes, according to NCSBE
newvoters[, voted2020gen:=0]
newvoters[voterID %in% votes2020gen$voterID, voted2020gen:=1] 
summary(newvoters$voted2020gen)
# also pull in primary vote (will want for pilot 1)
votes2020prim <- votehist[votehist$election_lbl=="03/03/2020",]; dim(votes2020gen) 
newvoters[, voted2020prim:=0]
newvoters[voterID %in% votes2020prim$voterID, voted2020prim:=1] 
summary(newvoters$voted2020prim)

## merge with voter file data
newvoters[, fullname:= paste(first_name, middle_name, last_name, sep=" ")]

#okay, now get ready to merge
NCsamp <- data.table(NC)

#line up some colnames:
NCsamp[, mail_addr1 := TP.Street.Address.1]
NCsamp[, mail_city := TP.City.1]
NCsamp[, mail_zipcode := as.character(TP.Zip.Code.1)]
NCsamp[, county_desc := TP.County.1]
NCsamp[, dpsname:= paste(first_name, last_name, sep=" ")]
NCsamp[, fullname:= paste(first_name, middle_name, last_name, sep=" ")]

#then here's the manual merge code (again, pulled pretty much verbatim from previous code)
# start with exact name/yob/address matches, then broaden out. 
variablenames <- c("last_name","first_name", "middle_name", "birth_year", "mail_addr1", "mail_city", "mail_zipcode", "county_desc")
exactmatches <- merge(newvoters, NCsamp, by=variablenames)
dim(exactmatches)
exactnamematches1 <- merge(newvoters, NCsamp, by=c("last_name","first_name", "middle_name", "birth_year"))
dim(exactnamematches1)
exactnamematches1 <- exactnamematches1[!(CMDORNUM %in% exactmatches$CMDORNUM),] 
dim(exactnamematches1); head(subset(exactnamematches1, select=c("last_name","first_name", "middle_name", "birth_year", "mail_addr1.x", "mail_city.x", "mail_zipcode.x", "county_desc.x", "mail_addr1.y", "mail_city.y", "mail_zipcode.y", "county_desc.y")))
exactnamematches1[county_desc.x==county_desc.y, countymatch:=1]; table(exactnamematches1$countymatch) 
exactnamematches1[mail_city.x==mail_city.y, citymatch:=1]; table(exactnamematches1$citymatch) 
exactnamematches1[mail_zipcode.x==mail_zipcode.y, zipmatch:=1]; table(exactnamematches1$zipmatch)
exactnamematches1 <- exactnamematches1[countymatch==1,] #add in a geographic limitation here (comment out if not using)

firstlastnames <- merge(newvoters, NCsamp, by=c("last_name","first_name", "birth_year")) #drop middle name criterion
dim(firstlastnames)
firstlastnames1 <- firstlastnames[!(CMDORNUM %in% exactmatches$CMDORNUM) & !(CMDORNUM %in% exactnamematches1$CMDORNUM),] 
dim(firstlastnames1)
firstlastnames1[,mnamelengthx:= nchar(middle_name.x)] 
firstlastnames1[,mnamelengthy:= nchar(middle_name.y)] 
firstlastnames1[mnamelengthx==1 & middle_name.x == substr(middle_name.y,0,1), keep:=1] #initial-only matches initialized version: X/Y
firstlastnames1[mnamelengthy==1 & middle_name.y == substr(middle_name.x,0,1), keep:=1] # same but for Y/X

firstlastnames2 <- firstlastnames1[keep==1,] #keep just the ones we flagged
dim(firstlastnames2); head(subset(firstlastnames2, select=c("last_name","first_name", "middle_name.x", "middle_name.y","birth_year", "mail_addr1.x", "mail_city.x", "mail_zipcode.x", "county_desc.x", "mail_addr1.y", "mail_city.y", "mail_zipcode.y", "county_desc.y")))
firstlastnames2[county_desc.x==county_desc.y, countymatch:=1]; table(firstlastnames2$countymatch) 
firstlastnames2[mail_city.x==mail_city.y, citymatch:=1]; table(firstlastnames2$citymatch) 
firstlastnames2[mail_zipcode.x==mail_zipcode.y, zipmatch:=1]; table(firstlastnames2$zipmatch) 
firstlastnames2 <- firstlastnames2[countymatch==1,] #add in a geographic limitation here (comment out if not using)

exactaddressYOBmatches1 <- merge(newvoters, NCsamp, by=c("birth_year", "mail_addr1", "mail_city", "mail_zipcode"))
dim(exactaddressYOBmatches1) 
exactaddressYOBmatches1[, fnamedist := stringdist(first_name.x, first_name.y, method="jw")] #calculate string distances between these names
exactaddressYOBmatches1[, mnamedist := stringdist(middle_name.x, middle_name.y, method="jw")] 
exactaddressYOBmatches1[, lnamedist := stringdist(last_name.x, last_name.y, method="jw")] 
summary(exactaddressYOBmatches1$lnamedist)
exactaddressYOBmatches1[, fullnamedist := stringdist(fullname.x, fullname.y, method="jw")] #in case people use maiden as middle name
summary(exactaddressYOBmatches1$fullnamedist) 
head(subset(exactaddressYOBmatches1, select=c("fullname.x", "fullname.y","fullnamedist")), 20)

exactaddressYOBmatches2 <- exactaddressYOBmatches1[(fnamedist <.25 & lnamedist <.25) | fullnamedist <.25,]; dim(exactaddressYOBmatches2) 
summary(exactaddressYOBmatches2$lnamedist) 
exactaddressYOBmatches2[,mnamelengthx:= nchar(middle_name.x)] 
exactaddressYOBmatches2[,mnamelengthy:= nchar(middle_name.y)]
summary(exactaddressYOBmatches2$mnamelengthx)
exactaddressYOBmatches2[ (mnamelengthx >1 & mnamelengthy >1) & mnamedist >.1, mnamemismatch:=1]; table(exactaddressYOBmatches2$mnamemismatch)
exactaddressYOBmatches3 <- exactaddressYOBmatches2[is.na(mnamemismatch),]; dim(exactaddressYOBmatches3)

addressYOBmatches <- exactaddressYOBmatches3[!(CMDORNUM %in% exactmatches$CMDORNUM) & !(CMDORNUM %in% firstlastnames2$CMDORNUM),]; dim(addressYOBmatches)

allmatches <- rbind(exactmatches, addressYOBmatches, exactnamematches1, firstlastnames2, fill=TRUE) #stick together all these match approaches
dim(allmatches); length(unique(allmatches$CMDORNUM)); dim(unique(allmatches))

table(allmatches$voter_status_desc)
allmatches <- allmatches[voter_status_desc =="ACTIVE",]; dim(allmatches) 

# saveRDS(object = allmatches, file = "tmp/allmatches_pilots.rds")
# allmatches <- readRDS("tmp/allmatches_pilots.rds")

## add some variables

NC$reg2020 <- ifelse(NC$CMDORNUM %in% allmatches$CMDORNUM, 1, 0) 
table(NC$reg2020)
NC[, voted2020gen:= 0]
NC[CMDORNUM %in% allmatches[allmatches$voted2020gen==1,]$CMDORNUM, voted2020gen:= 1]
table(NC$voted2020gen)
NC[, voted2020prim:= 0]
NC[CMDORNUM %in% allmatches[allmatches$voted2020prim==1,]$CMDORNUM, voted2020prim:= 1]
table(NC$voted2020prim)

NC[, ageyears:= 2020-birth_year] 
NC[, white := 0]; NC[trimws(CMCLRACE)=="WHITE", white := 1]; 
NC[, black := 0]; NC[trimws(CMCLRACE)=="BLACK", black := 1]; 
NC[, male := 0]; NC[trimws(gender_code)=="M", male := 1];
NC$pastincarc <- ifelse(is.na(NC$CICLSTNM), 0, 1) #if you have no last name from the inmate file, you don't have an inmate record 
NC$lastincarcend <- as.Date(NC$CILAMVDT, "%m/%d/%Y"); summary(NC$lastincarcend) 
NC$dayssincerelease <- as.numeric(as.Date("2020-11-03")- NC$lastincarcend)

NC$treatmentarm <- NC$treatmentasst #set up a treatment indicator that distinguishes between T2 in different pilots
NC[NC$pilotnum==1, "treatmentarm"] <- ifelse(NC[NC$pilotnum==1, "treatmentasst"]=="Treat", "T2P1", "T1")
NC[NC$pilotnum==2 & NC$treatmentasst=="T2", "treatmentarm"] <- "T2P2"
NC[NC$pilotnum==2 & NC$treatmentasst=="T3", "treatmentarm"] <- "T3P2"
NC[NC$pilotnum==3 & NC$treatmentasst=="T2", "treatmentarm"] <- "T2P3"
NC[NC$pilotnum==3 & NC$treatmentasst=="T3", "treatmentarm"] <- "T3P3"
table(NC$treatmentarm)

#now, at JD's request, set this up differently so "basic mailer" always comes through the same?
NC$treatmentarm <- NC$treatmentasst #set up a treatment indicator that distinguishes between T2 in different pilots
NC[NC$pilotnum==1, "treatmentarm"] <- ifelse(NC[NC$pilotnum==1, "treatmentasst"]=="Treat", "T2", "T1")
NC[NC$pilotnum==2 & NC$treatmentasst=="T2", "treatmentarm"] <- "T2"
NC[NC$pilotnum==2 & NC$treatmentasst=="T3", "treatmentarm"] <- "T3P2"
NC[NC$pilotnum==3 & NC$treatmentasst=="T2", "treatmentarm"] <- "T2"
NC[NC$pilotnum==3 & NC$treatmentasst=="T3", "treatmentarm"] <- "T3P3"

#wait, now we also want to try adding in registration as of particular dates. 
NC[, registrationdate := as.Date(allmatches$registrationdate[match(NC$CMDORNUM, allmatches$CMDORNUM)]) ] #fill in reg date from match to VF
summary(NC$registrationdate)
#so for each pilot, we want to calculate registration as of 1month after we went live. 
NC[, reg1monthout := 0]
NC[is.na(registrationdate)==F & pilotnum==1 & registrationdate <= as.Date("2020-02-01") , reg1monthout := 1] #pilot 1: 30 days after jan 2 is feb 1
NC[is.na(registrationdate)==F & pilotnum==2 & registrationdate <= as.Date("2020-04-24") , reg1monthout := 1] #pilot 2: 30 days after march 25 is apr24
NC[is.na(registrationdate)==F & pilotnum==3 & registrationdate <= as.Date("2020-07-22") , reg1monthout := 1] #pilot 3: 30 days after june 22 is july 22

pilotcols <- c("reg1monthout","voted2020prim","voted2020gen","treat","pilotnum", "treatmentarm")
write_csv(NC[,pilotcols], file = "data/NCpilot_deid.csv")
# NC <- read_csv("data/NCpilot.csv")
